Merge "Add missing file locks to deleteArchivedFiles.php"
author jenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Thu, 21 Jul 2016 18:23:24 +0000 (18:23 +0000)
committer Gerrit Code Review <gerrit@wikimedia.org>
Thu, 21 Jul 2016 18:23:24 +0000 (18:23 +0000)
autoload.php
composer.json
includes/Title.php
includes/collation/IcuCollation.php
includes/deferred/AtomicSectionUpdate.php
includes/deferred/AutoCommitUpdate.php [new file with mode: 0644]
includes/filerepo/file/LocalFile.php
includes/tidy/Balancer.php
tests/phpunit/includes/tidy/BalancerTest.php

index ecbc9b3..76a329f 100644 (file)
@@ -157,6 +157,7 @@ $wgAutoloadLocalClasses = [
        'AuthManagerSpecialPage' => __DIR__ . '/includes/specialpage/AuthManagerSpecialPage.php',
        'AuthPlugin' => __DIR__ . '/includes/AuthPlugin.php',
        'AuthPluginUser' => __DIR__ . '/includes/AuthPlugin.php',
+       'AutoCommitUpdate' => __DIR__ . '/includes/deferred/AutoCommitUpdate.php',
        'AutoLoader' => __DIR__ . '/includes/AutoLoader.php',
        'AutoloadGenerator' => __DIR__ . '/includes/utils/AutoloadGenerator.php',
        'Autopromote' => __DIR__ . '/includes/Autopromote.php',
index d5bf93c..54eb3c0 100644 (file)
@@ -40,7 +40,7 @@
                "wikimedia/relpath": "1.0.3",
                "wikimedia/running-stat": "1.1.0",
                "wikimedia/utfnormal": "1.0.3",
-               "wikimedia/wrappedstring": "2.1.1",
+               "wikimedia/wrappedstring": "2.2.0",
                "zordius/lightncandy": "0.23"
        },
        "require-dev": {
index 62f4060..8aa8cb7 100644 (file)
@@ -4367,18 +4367,23 @@ class Title implements LinkTarget {
                        return true; // avoid gap locking if we know it's not there
                }
 
-               $method = __METHOD__;
-               $dbw = wfGetDB( DB_MASTER );
                $conds = $this->pageCond();
-               $dbw->onTransactionIdle( function () use ( $dbw, $conds, $method, $purgeTime ) {
-                       $dbTimestamp = $dbw->timestamp( $purgeTime ?: time() );
-                       $dbw->update(
-                               'page',
-                               [ 'page_touched' => $dbTimestamp ],
-                               $conds + [ 'page_touched < ' . $dbw->addQuotes( $dbTimestamp ) ],
-                               $method
-                       );
-               } );
+               DeferredUpdates::addUpdate(
+                       new AutoCommitUpdate(
+                               wfGetDB( DB_MASTER ),
+                               __METHOD__,
+                               function ( IDatabase $dbw, $fname ) use ( $conds, $purgeTime ) {
+                                       $dbTimestamp = $dbw->timestamp( $purgeTime ?: time() );
+                                       $dbw->update(
+                                               'page',
+                                               [ 'page_touched' => $dbTimestamp ],
+                                               $conds + [ 'page_touched < ' . $dbw->addQuotes( $dbTimestamp ) ],
+                                               $fname
+                                       );
+                               }
+                       ),
+                       DeferredUpdates::PRESEND
+               );
 
                return true;
        }
index b956d4b..c2e8b24 100644 (file)
@@ -451,6 +451,13 @@ class IcuCollation extends Collation {
                $versionPrefix = substr( $icuVersion, 0, 3 );
                // Source: http://site.icu-project.org/download
                $map = [
+                       '57.' => '8.0',
+                       '56.' => '8.0',
+                       '55.' => '7.0',
+                       '54.' => '7.0',
+                       '53.' => '6.3',
+                       '52.' => '6.3',
+                       '51.' => '6.2',
                        '50.' => '6.2',
                        '49.' => '6.1',
                        '4.8' => '6.0',
index a9921b3..ccbd6b0 100644 (file)
@@ -9,26 +9,34 @@ class AtomicSectionUpdate implements DeferrableUpdate {
        private $dbw;
        /** @var string */
        private $fname;
-       /** @var Closure|callable */
+       /** @var callable */
        private $callback;
 
        /**
         * @param IDatabase $dbw
         * @param string $fname Caller name (usually __METHOD__)
         * @param callable $callback
-        * @throws InvalidArgumentException
         * @see IDatabase::doAtomicSection()
         */
-       public function __construct( IDatabase $dbw, $fname, $callback ) {
+       public function __construct( IDatabase $dbw, $fname, callable $callback ) {
                $this->dbw = $dbw;
                $this->fname = $fname;
-               if ( !is_callable( $callback ) ) {
-                       throw new InvalidArgumentException( 'Not a valid callback/closure!' );
-               }
                $this->callback = $callback;
+
+               if ( $this->dbw->trxLevel() ) {
+                       $this->dbw->onTransactionResolution( [ $this, 'cancelOnRollback' ] );
+               }
        }
 
        public function doUpdate() {
-               $this->dbw->doAtomicSection( $this->fname, $this->callback );
+               if ( $this->callback ) {
+                       $this->dbw->doAtomicSection( $this->fname, $this->callback );
+               }
+       }
+
+       public function cancelOnRollback( $trigger ) {
+               if ( $trigger === IDatabase::TRIGGER_ROLLBACK ) {
+                       $this->callback = null;
+               }
        }
 }
diff --git a/includes/deferred/AutoCommitUpdate.php b/includes/deferred/AutoCommitUpdate.php
new file mode 100644 (file)
index 0000000..ddf2bb8
--- /dev/null
@@ -0,0 +1,56 @@
+<?php
+
+/**
+ * Deferrable Update that runs a closure/callback with DBO_TRX cleared on the handle (auto-commit mode)
+ * @since 1.28
+ */
+class AutoCommitUpdate implements DeferrableUpdate {
+       /** @var IDatabase Handle passed to the callback; its DBO_TRX flag is toggled in doUpdate() */
+       private $dbw;
+       /** @var string Caller name, passed to the callback as its second argument */
+       private $fname;
+       /** @var callable|null Set to null by cancelOnRollback(), which makes doUpdate() a no-op */
+       private $callback;
+
+       /**
+        * @param IDatabase $dbw Handle the callback runs against
+        * @param string $fname Caller name (usually __METHOD__)
+        * @param callable $callback Callback that takes (IDatabase, method name string)
+        */
+       public function __construct( IDatabase $dbw, $fname, callable $callback ) {
+               $this->dbw = $dbw;
+               $this->fname = $fname;
+               $this->callback = $callback;
+
+               if ( $this->dbw->trxLevel() ) { // presumably truthy only while a transaction is open
+                       $this->dbw->onTransactionResolution( [ $this, 'cancelOnRollback' ] );
+               }
+       }
+
+       public function doUpdate() { // runs the callback unless cancelOnRollback() cleared it
+               if ( !$this->callback ) {
+                       return;
+               }
+
+               $autoTrx = $this->dbw->getFlag( DBO_TRX ); // remembered so the flag can be restored below
+               $this->dbw->clearFlag( DBO_TRX );
+               try {
+                       /** @var Exception $e */
+                       $e = null;
+                       call_user_func_array( $this->callback, [ $this->dbw, $this->fname ] );
+               } catch ( Exception $e ) { // kept in $e and rethrown once the flag has been restored
+               } // NOTE(review): a PHP 7 Error is not an Exception and would skip the restore below - confirm
+               if ( $autoTrx ) {
+                       $this->dbw->setFlag( DBO_TRX );
+               }
+               if ( $e ) {
+                       throw $e;
+               }
+       }
+
+       public function cancelOnRollback( $trigger ) { // registered via onTransactionResolution() above
+               if ( $trigger === IDatabase::TRIGGER_ROLLBACK ) {
+                       $this->callback = null; // doUpdate() returns early when this is null
+               }
+       }
+}
index cab9316..234dbac 100644 (file)
@@ -1637,16 +1637,20 @@ class LocalFile extends File {
                // Purge the source and target files...
                $oldTitleFile = wfLocalFile( $this->title );
                $newTitleFile = wfLocalFile( $target );
-               // Hack: the lock()/unlock() pair is nested in a transaction so the locking is not
-               // tied to BEGIN/COMMIT. To avoid slow purges in the transaction, move them outside.
-               $this->getRepo()->getMasterDB()->onTransactionIdle(
-                       function () use ( $oldTitleFile, $newTitleFile, $archiveNames ) {
-                               $oldTitleFile->purgeEverything();
-                               foreach ( $archiveNames as $archiveName ) {
-                                       $oldTitleFile->purgeOldThumbnails( $archiveName );
+               // To avoid slow purges in the transaction, move them outside...
+               DeferredUpdates::addUpdate(
+                       new AutoCommitUpdate(
+                               $this->getRepo()->getMasterDB(),
+                               __METHOD__,
+                               function () use ( $oldTitleFile, $newTitleFile, $archiveNames ) {
+                                       $oldTitleFile->purgeEverything();
+                                       foreach ( $archiveNames as $archiveName ) {
+                                               $oldTitleFile->purgeOldThumbnails( $archiveName );
+                                       }
+                                       $newTitleFile->purgeEverything();
                                }
-                               $newTitleFile->purgeEverything();
-                       }
+                       ),
+                       DeferredUpdates::PRESEND
                );
 
                if ( $status->isOK() ) {
@@ -1682,7 +1686,7 @@ class LocalFile extends File {
 
                $this->lock(); // begin
                $batch->addCurrent();
-               # Get old version relative paths
+               // Get old version relative paths
                $archiveNames = $batch->addOlds();
                $status = $batch->execute();
                $this->unlock(); // done
@@ -1691,16 +1695,19 @@ class LocalFile extends File {
                        DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [ 'images' => -1 ] ) );
                }
 
-               // Hack: the lock()/unlock() pair is nested in a transaction so the locking is not
-               // tied to BEGIN/COMMIT. To avoid slow purges in the transaction, move them outside.
-               $that = $this;
-               $this->getRepo()->getMasterDB()->onTransactionIdle(
-                       function () use ( $that, $archiveNames ) {
-                               $that->purgeEverything();
-                               foreach ( $archiveNames as $archiveName ) {
-                                       $that->purgeOldThumbnails( $archiveName );
+               // To avoid slow purges in the transaction, move them outside...
+               DeferredUpdates::addUpdate(
+                       new AutoCommitUpdate(
+                               $this->getRepo()->getMasterDB(),
+                               __METHOD__,
+                               function () use ( $archiveNames ) {
+                                       $this->purgeEverything();
+                                       foreach ( $archiveNames as $archiveName ) {
+                                               $this->purgeOldThumbnails( $archiveName );
+                                       }
                                }
-                       }
+                       ),
+                       DeferredUpdates::PRESEND
                );
 
                // Purge the CDN
index 0fa96bd..b2d6ba1 100644 (file)
@@ -43,6 +43,11 @@ use \Sanitizer;
 # as soon as possible (usually as soon as the tag is closed) to reduce
 # its memory footprint.
 
+# We've been gradually lifting some of these restrictions to handle
+# non-sanitized output generated by extensions, but we shortcut the tokenizer
+# for speed (primarily by splitting on `<`) and so rely on syntactic
+# well-formedness.
+
 # On the other hand, I've been pretty careful to note with comments in the
 # code the places where this implementation omits features of the spec or
 # depends on the MediaWiki Sanitizer.  Perhaps in the future we'll want to
@@ -70,7 +75,7 @@ class BalanceSets {
                self::HTML_NAMESPACE => [
                        'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
                        'frame' => true,
-                       'plaintext' => true, 'isindex' => true, 'textarea' => true,
+                       'plaintext' => true, 'isindex' => true,
                        'xmp' => true, 'iframe' => true, 'noembed' => true,
                        'noscript' => true, 'script' => true,
                        'title' => true
@@ -87,6 +92,12 @@ class BalanceSets {
                ]
        ];
 
+       public static $extraLinefeedSet = [
+               self::HTML_NAMESPACE => [
+                       'pre' => true, 'textarea' => true, 'listing' => true,
+               ]
+       ];
+
        public static $headingSet = [
                self::HTML_NAMESPACE => [
                        'h1' => true, 'h2' => true, 'h3' => true,
@@ -508,11 +519,21 @@ class BalanceElement {
                }
                if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
                        $out = "<{$this->localName}{$encAttribs}>";
+                       $len = strlen( $out );
                        // flatten children
                        foreach ( $this->children as $elt ) {
                                $out .= "{$elt}";
                        }
                        $out .= "</{$this->localName}>";
+                       if (
+                               $this->isA( BalanceSets::$extraLinefeedSet ) &&
+                               $out[$len] === "\n"
+                       ) {
+                               // Double the linefeed after pre/listing/textarea
+                               // according to the HTML5 fragment serialization algorithm.
+                               $out = substr( $out, 0, $len + 1 ) .
+                                       substr( $out, $len );
+                       }
                } else {
                        $out = "<{$this->localName}{$encAttribs} />";
                        Assert::invariant(
@@ -676,19 +697,29 @@ class BalanceStack implements IteratorAggregate {
                return $out;
        }
 
+       /**
+        * Insert a comment at the appropriate place for inserting a node.
+        * @param string $value Content of the comment, without the `<!--` and `-->` delimiters.
+        * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
+        */
+       public function insertComment( $value ) {
+               // Handled as text; passing $isComment = true makes insertText() skip its tidyCompat p-wrap branch.
+               return $this->insertText( '<!--' . $value . '-->', true );
+       }
+
        /**
         * Insert text at the appropriate place for inserting a node.
         * @param string $value
         * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
         */
-       public function insertText( $value ) {
+       public function insertText( $value, $isComment = false ) {
                if (
                        $this->fosterParentMode &&
                        $this->currentNode->isA( BalanceSets::$tableSectionRowSet )
                ) {
                        $this->fosterParent( $value );
                } elseif (
-                       $this->tidyCompat &&
+                       $this->tidyCompat && !$isComment &&
                        $this->currentNode->isA( BalanceSets::$tidyPWrapSet )
                ) {
                        $this->insertHTMLELement( 'mw:p-wrap', [] );
@@ -1725,17 +1756,19 @@ class BalanceActiveFormattingElements {
  * - The document is never in "quirks mode".
  * - All occurrences of < and > have been entity escaped, so we
  *   can parse tags by simply splitting on those two characters.
+ *   (This also simplifies the handling of < inside <textarea>.)
+ *   The character < must not appear inside comments.
  *   Similarly, all attributes have been "cleaned" and are double-quoted
  *   and escaped.
- * - All comments and null characters are assumed to have been removed.
- * - We don't alter linefeeds after <pre>/<listing>.
+ * - All null characters are assumed to have been removed.
  * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- *   <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
+ *   <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
  *   <noembed>, <noscript>, <script>, <title>.  As a result,
  *   further simplifications can be made:
  *   - `frameset-ok` is not tracked.
  *   - `head element pointer` is not tracked (but presumed non-null)
- *   - Tokenizer has only a single mode.
+ *   - Tokenizer has only a single mode. (<textarea> wants RCDATA and
+ *     <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
  *
  *   We generally mark places where we omit cases from the spec due to
  *   disallowed elements with a comment: `# OMITTED: <element-name>`.
@@ -1757,12 +1790,47 @@ class Balancer {
        private $stack;
        private $strict;
        private $tidyCompat;
+       private $allowComments;
 
-       private $textIntegrationMode = false;
+       private $textIntegrationMode;
        private $pendingTableText;
        private $originalInsertionMode;
        private $fragmentContext;
        private $formElementPointer;
+       private $ignoreLinefeed;
+       private $inRCDATA;
+       private $inRAWTEXT;
+
+       /**
+        * Valid HTML5 comments.
+        * Regex borrowed from Tim Starling's "remex-html" project.
+        */
+       const VALID_COMMENT_REGEX = "~ !--
+               (                             # 1. Comment match detector
+                       > | -> | # Invalid short close
+                       (                         # 2. Comment contents
+                               (?:
+                                       (?! --> )
+                                       (?! --!> )
+                                       (?! --! \z )
+                                       (?! -- \z )
+                                       (?! - \z )
+                                       .
+                               )*+
+                       )
+                       (                         # 3. Comment close
+                               --> |   # Normal close
+                               --!> |  # Comment end bang
+                               (                     # 4. Indicate matches requiring EOF
+                                       --! |   # EOF in comment end bang state
+                                       -- |    # EOF in comment end state
+                                       -  |    # EOF in comment end dash state
+                                               # EOF in comment state
+                               )
+                       )
+               )
+               ([^<]*) \z                    # 5. Non-tag text after the comment
+               ~xs";
 
        /**
         * Create a new Balancer.
@@ -1782,16 +1850,23 @@ class Balancer {
         *         program: <p>-wrapping is done to the children of
         *         <body> and <blockquote> elements, and empty elements
         *         are removed.
+        *     'allowComments': boolean, defaults to true.
+        *         When true, allows HTML comments in the input.
+        *         The Sanitizer generally strips all comments, so if you
+        *         are running on sanitized output you can set this to
+        *         false to get a bit more performance.
         */
        public function __construct( array $config = [] ) {
                $config = $config + [
                        'strict' => false,
                        'allowedHtmlElements' => null,
                        'tidyCompat' => false,
+                       'allowComments' => true,
                ];
                $this->allowedHtmlElements = $config['allowedHtmlElements'];
                $this->strict = $config['strict'];
                $this->tidyCompat = $config['tidyCompat'];
+               $this->allowComments = $config['allowComments'];
                if ( $this->allowedHtmlElements !== null ) {
                        # Sanity check!
                        $bad = array_uintersect_assoc(
@@ -1835,6 +1910,11 @@ class Balancer {
                $this->processingCallback = $processingCallback;
                $this->processingArgs = $processingArgs;
 
+               $this->textIntegrationMode =
+                       $this->ignoreLinefeed =
+                       $this->inRCDATA =
+                       $this->inRAWTEXT = false;
+
                # The stack is constructed with an <html> element already on it.
                # Set this up as a fragment parsed with <body> as the context.
                $this->fragmentContext =
@@ -1887,6 +1967,19 @@ class Balancer {
                        # Don't actually inject the empty string as a text token.
                        return true;
                }
+               // Support pre/listing/textarea by suppressing initial linefeed
+               if ( $this->ignoreLinefeed ) {
+                       $this->ignoreLinefeed = false;
+                       if ( $token === 'text' ) {
+                               if ( $value[0] === "\n" ) {
+                                       if ( $value === "\n" ) {
+                                               # Nothing would be left, don't inject the empty string.
+                                               return true;
+                                       }
+                                       $value = substr( $value, 1 );
+                               }
+                       }
+               }
                // Some hoops we have to jump through
                $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
 
@@ -2030,12 +2123,27 @@ class Balancer {
 
        /**
         * Grab the next "token" from $bitsIterator.  This is either a open/close
-        * tag or text, depending on whether the Sanitizer approves.
+        * tag or text or a comment, depending on whether the Sanitizer approves.
         */
        private function advance() {
                $x = $this->bitsIterator->current();
                $this->bitsIterator->next();
                $regs = [];
+               # Handle comments.  These won't be generated by mediawiki (they
+               # are stripped in the Sanitizer) but may be generated by extensions.
+               if (
+                       $this->allowComments &&
+                       !( $this->inRCDATA || $this->inRAWTEXT ) &&
+                       preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+                       /* verify EOF condition where necessary */
+                       ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
+               ) {
+                       $contents = $regs[2][0];
+                       $rest = $regs[5][0];
+                       $this->insertToken( 'comment', $contents );
+                       $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+                       return;
+               }
                # $slash: Does the current element start with a '/'?
                # $t: Current element name
                # $attribStr: String between element name and >
@@ -2060,6 +2168,22 @@ class Balancer {
                        $slash = $t = $attribStr = $brace = $rest = null;
                }
                $goodtag = $t;
+               if ( $this->inRCDATA ) {
+                       if ( $slash && $t === $this->inRCDATA ) {
+                               $this->inRCDATA = false;
+                       } else {
+                               // No tags allowed; this emulates the "rcdata" tokenizer mode.
+                               $goodtag = false;
+                       }
+               }
+               if ( $this->inRAWTEXT ) {
+                       if ( $slash && $t === $this->inRAWTEXT ) {
+                               $this->inRAWTEXT = false;
+                       } else {
+                               // No tags allowed, no entity-escaping done.
+                               $goodtag = false;
+                       }
+               }
                $sanitize = $this->allowedHtmlElements !== null;
                if ( $sanitize ) {
                        $goodtag = $t && isset( $this->allowedHtmlElements[$t] );
@@ -2086,6 +2210,8 @@ class Balancer {
                if ( $goodtag ) {
                        $rest = str_replace( '>', '&gt;', $rest );
                        $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+               } elseif ( $this->inRAWTEXT ) {
+                       $this->insertToken( 'text', "<$x" );
                } else {
                        # bad tag; serialize entire thing as text.
                        $this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
@@ -2191,7 +2317,7 @@ class Balancer {
 
        private function parseRawText( $value, $attribs = null ) {
                $this->stack->insertHTMLElement( $value, $attribs );
-               // XXX switch tokenizer to rawtext state?
+               $this->inRAWTEXT = $value;
                $this->originalInsertionMode = $this->switchMode( 'inTextMode' );
                return true;
        }
@@ -2272,6 +2398,9 @@ class Balancer {
                                // ignore any other end tag
                                return true;
                        }
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
 
                // If not handled above
@@ -2359,9 +2488,8 @@ class Balancer {
                                        $this->inBodyMode( 'endtag', 'p' );
                                }
                                $this->stack->insertHTMLElement( $value, $attribs );
-                               # As described in "simplifications" above:
-                               # 1. We don't touch the next token, even if it's a linefeed.
-                               # 2. OMITTED: frameset_ok
+                               $this->ignoreLinefeed = true;
+                               # OMITTED: frameset_ok
                                return true;
 
                        case 'form':
@@ -2535,7 +2663,14 @@ class Balancer {
                                return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
 
                        # OMITTED: <isindex>
-                       # OMITTED: <textarea>
+
+                       case 'textarea':
+                               $this->stack->insertHTMLElement( $value, $attribs );
+                               $this->ignoreLinefeed = true;
+                               $this->inRCDATA = $value; // emulate rcdata tokenizer mode
+                               # OMITTED: frameset_ok
+                               return true;
+
                        # OMITTED: <xmp>
                        # OMITTED: <iframe>
                        # OMITTED: <noembed>
@@ -2782,6 +2917,9 @@ class Balancer {
                                }
                        }
                        return true;
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                } else {
                        Assert::invariant( false, "Bad token type: $token" );
                }
@@ -2885,6 +3023,9 @@ class Balancer {
                                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
                        }
                        // Fall through for "anything else" clause.
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
                // This is the "anything else" case:
                $this->stack->fosterParentMode = true;
@@ -3012,6 +3153,9 @@ class Balancer {
                        // Fall through for "anything else".
                } elseif ( $token === 'eof' ) {
                        return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
 
                // Anything else
@@ -3289,6 +3433,9 @@ class Balancer {
                        case 'template':
                                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
                        }
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
                // anything else: just ignore the token
                return true;
@@ -3320,7 +3467,7 @@ class Balancer {
        }
 
        private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
-               if ( $token === 'text' ) {
+               if ( $token === 'text' || $token === 'comment' ) {
                        return $this->inBodyMode( $token, $value, $attribs, $selfclose );
                } elseif ( $token === 'eof' ) {
                        if ( $this->stack->indexOf( 'template' ) < 0 ) {
index 213982a..f69ecaf 100644 (file)
@@ -15,18 +15,12 @@ class BalancerTest extends MediaWikiTestCase {
                        'strict' => false, /* not strict */
                        'allowedHtmlElements' => null, /* no sanitization */
                        'tidyCompat' => false, /* standard parser */
+                       'allowComments' => true, /* comment parsing */
                ] );
        }
 
        /**
-        * Anything cleanup you need to do should go here.
-        */
-       protected function tearDown() {
-               parent::tearDown();
-       }
-
-       /**
-        * @covers Balancer::balance
+        * @covers MediaWiki\Tidy\Balancer::balance
         * @dataProvider provideBalancerTests
         */
        public function testBalancer( $description, $input, $expected ) {
@@ -47,15 +41,16 @@ class BalancerTest extends MediaWikiTestCase {
                // for providers, and filter out HTML constructs which
                // the balancer doesn't support.
                $tests = [];
-               $start = '<html><head></head><body>';
-               $end = '</body></html>';
+               $okre = "~ \A
+                       (?i:<!DOCTYPE\ html>)?
+                       <html><head></head><body>
+                       .*
+                       </body></html>
+               \z ~xs";
                foreach ( $json as $filename => $cases ) {
                        foreach ( $cases as $case ) {
                                $html = $case['document']['html'];
-                               if (
-                                       substr( $html, 0, strlen( $start ) ) !== $start ||
-                                       substr( $html, -strlen( $end ) ) !== $end
-                               ) {
+                               if ( !preg_match( $okre, $html ) ) {
                                        // Skip tests which involve stuff in the <head> or
                                        // weird doctypes.
                                        continue;
@@ -69,21 +64,32 @@ class BalancerTest extends MediaWikiTestCase {
                                $html = $case['document']['noQuirksBodyHtml'];
                                // Normalize case of SVG attributes.
                                $html = str_replace( 'foreignObject', 'foreignobject', $html );
+                               // Normalize case of MathML attributes.
+                               $html = str_replace( 'definitionURL', 'definitionurl', $html );
 
-                               if ( isset( $case['document']['props']['comment'] ) ) {
-                                       // Skip tests which include HTML comments, which
-                                       // the balancer requires to have been stripped.
+                               if (
+                                       isset( $case['document']['props']['comment'] ) &&
+                                       preg_match( ',<!--[^>]*<,', $html )
+                               ) {
+                                       // Skip tests which include HTML comments containing
+                                       // the < character, which we don't support.
                                        continue;
                                }
                                if ( strpos( $case['data'], '<![CDATA[' ) !== false ) {
                                        // Skip tests involving <![CDATA[ ]]> quoting.
                                        continue;
                                }
-                               if ( stripos( $case['data'], '<!DOCTYPE' ) !== false ) {
-                                       // Skip tests involving doctypes.
+                               if (
+                                       stripos( $case['data'], '<!DOCTYPE' ) !== false &&
+                                       stripos( $case['data'], '<!DOCTYPE html>' ) === false
+                               ) {
+                                       // Skip tests involving unusual doctypes.
                                        continue;
                                }
-                               if ( preg_match( ',</?(html|head|body|frame|plaintext)>|<rdar:|<isindex,i', $case['data'] ) ) {
+                               $literalre = "~ <rdar: | <isindex | < /? (
+                                       html | head | body | frame | frameset | plaintext
+                               ) > ~xi";
+                               if ( preg_match( $literalre, $case['data'] ) ) {
                                        // Skip tests involving some literal tags, which are
                                        // unsupported but don't show up in the expected output.
                                        continue;
@@ -95,7 +101,6 @@ class BalancerTest extends MediaWikiTestCase {
                                        isset( $case['document']['props']['tags']['script'] ) ||
                                        isset( $case['document']['props']['tags']['svg script'] ) ||
                                        isset( $case['document']['props']['tags']['svg title'] ) ||
-                                       isset( $case['document']['props']['tags']['textarea'] ) ||
                                        isset( $case['document']['props']['tags']['title'] ) ||
                                        isset( $case['document']['props']['tags']['xmp'] )
                                ) {
@@ -116,7 +121,8 @@ class BalancerTest extends MediaWikiTestCase {
                                        isset( $case['document']['props']['tagWithLt'] ) ||
                                        isset( $case['document']['props']['attrWithFunnyChar'] ) ||
                                        preg_match( ':^(</b test|<di|<foo bar=qux/>)$:', $case['data'] ) ||
-                                       preg_match( ':</p<p>:', $case['data'] )
+                                       preg_match( ':</p<p>:', $case['data'] ) ||
+                                       preg_match( ':<b &=&amp>|<p/x/y/z>:', $case['data'] )
                                ) {
                                        // Skip tests with funny tag or attribute names,
                                        // which are really tests of the HTML tokenizer, not
@@ -124,7 +130,7 @@ class BalancerTest extends MediaWikiTestCase {
                                        continue;
                                }
                                if (
-                                       stripos( $case['data'], 'encoding=" text/html "' ) !== false
+                                       preg_match( ':encoding=" text/html "|type=" hidden":', $case['data'] )
                                ) {
                                        // The Sanitizer normalizes whitespace in attribute
                                        // values, which makes this test case invalid.
@@ -134,9 +140,12 @@ class BalancerTest extends MediaWikiTestCase {
                                        // Skip tests with ASCII null, etc.
                                        continue;
                                }
+                               $data = preg_replace(
+                                       '~<!DOCTYPE html>~i', '', $case['data']
+                               );
                                $tests[] = [
                                        $filename, # use better description?
-                                       $case['data'],
+                                       $data,
                                        $html
                                ];
                        }